In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# SUPPORT VECTOR MACHINES
# Import necessary libraries
!pip install pandas numpy scikit-learn matplotlib seaborn ISLP
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from ISLP import load_data
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')
In [17]:
# Create Economy (ECO) labels based on mpg (22.75 is the median mpg)
Auto['ECO'] = np.where(Auto['mpg'] > 22.75, 'Economy', 'Consuming')
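A quick sanity check of the resulting class balance (an illustrative addition; a median split should give roughly equal groups):
In [ ]:
# How the median split divides the cars
print(Auto['ECO'].value_counts())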
In [19]:
# Visualize weight vs horsepower
sns.scatterplot(data=Auto, x='weight', y='horsepower', hue='ECO')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('Car Classification by Economy')
plt.show()
In [23]:
# Prepare dataset with necessary variables only
d = Auto[['ECO', 'weight', 'horsepower']]
# Perform SVM with linear kernel
X = d[['weight', 'horsepower']]
y = d['ECO']
svm_linear = SVC(kernel='linear', C=1)
svm_linear.fit(X, y)
print(f"Support Vectors:\n{svm_linear.support_vectors_}")
Support Vectors:
[[2833.   95.]
 [2774.   97.]
 [2587.   85.]
 ...
 [2950.   90.]
 [2790.   86.]
 [2720.   82.]]
(115 support vectors; output truncated)
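Note that weight (roughly 1600-5100) and horsepower (roughly 46-230) sit on very different scales, so the margin is dominated by weight. A hedged sketch of one common remedy, standardizing the features in a pipeline (not part of the original analysis):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize both features before fitting the linear SVM
svm_scaled = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1))
svm_scaled.fit(X, y)
print(f"Support vectors after scaling: {len(svm_scaled.named_steps['svc'].support_vectors_)}")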
In [25]:
# Plot the SVM with linear kernel
plt.scatter(X['weight'], X['horsepower'], c=y.apply(lambda x: 0 if x == "Consuming" else 1), cmap='coolwarm')
plt.scatter(svm_linear.support_vectors_[:, 0], svm_linear.support_vectors_[:, 1], s=100, facecolors='none', edgecolors='k')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('SVM with Linear Kernel')
plt.show()
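The plot above marks the support vectors but not the boundary itself. One way to draw it is to evaluate decision_function on a grid and contour its zero level (a sketch, assuming the svm_linear model fitted above):
In [ ]:
# Draw the separating line and margins of the linear SVM
xx, yy = np.meshgrid(np.linspace(X['weight'].min(), X['weight'].max(), 200),
                     np.linspace(X['horsepower'].min(), X['horsepower'].max(), 200))
Z = svm_linear.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.scatter(X['weight'], X['horsepower'], c=(y == 'Economy').astype(int), cmap='coolwarm')
plt.contour(xx, yy, Z, levels=[-1, 0, 1], linestyles=['--', '-', '--'], colors='k')
plt.xlabel('Weight')
plt.ylabel('Horsepower')
plt.title('Linear SVM Decision Boundary')
plt.show()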
In [27]:
# Try polynomial, radial, and sigmoid kernels
kernels = ['poly', 'rbf', 'sigmoid']
for kernel in kernels:
    svm = SVC(kernel=kernel, C=1)
    svm.fit(X, y)
    print(f"Kernel: {kernel}")
    print(f"Number of Support Vectors: {len(svm.support_vectors_)}")
Kernel: poly
Number of Support Vectors: 120
Kernel: rbf
Number of Support Vectors: 134
Kernel: sigmoid
Number of Support Vectors: 354
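The support-vector counts alone do not say which kernel classifies better. A short sketch comparing kernels by cross-validated accuracy instead, using the cross_val_score already imported:
In [ ]:
# Compare kernels by 10-fold cross-validated accuracy
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    scores = cross_val_score(SVC(kernel=kernel, C=1), X, y, cv=10)
    print(f"Kernel: {kernel}, mean CV accuracy: {scores.mean():.3f}")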
In [29]:
# Hyperparameter tuning with cross-validation
param_grid = {'C': np.logspace(-3, 3, 7)}
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=10)
grid.fit(X, y)
print(f"Best Parameters: {grid.best_params_}")
print(f"Best Score: {grid.best_score_}")
Best Parameters: {'C': 100.0}
Best Score: 0.8953205128205128
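GridSearchCV keeps the score of every candidate in cv_results_; a brief sketch for inspecting the full accuracy-versus-C profile rather than only the winner:
In [ ]:
# Mean CV accuracy for each candidate value of C
cv_table = pd.DataFrame(grid.cv_results_)[['param_C', 'mean_test_score']]
print(cv_table)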
In [31]:
# Tuning with different kernels
param_grid_kernels = {
    'C': np.logspace(-3, 3, 7),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
grid_kernels = GridSearchCV(SVC(), param_grid_kernels, cv=10)
grid_kernels.fit(X, y)
print(f"Best Parameters with Kernel Tuning: {grid_kernels.best_params_}")
print(f"Best Score with Kernel Tuning: {grid_kernels.best_score_}")
Best Parameters with Kernel Tuning: {'C': 100.0, 'kernel': 'linear'}
Best Score with Kernel Tuning: 0.8953205128205128
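For the rbf and sigmoid kernels, gamma typically matters as much as C. A hedged sketch extending the search to tune both jointly for the rbf kernel (a larger grid, so slower to run):
In [ ]:
# Jointly tune C and gamma for the rbf kernel (illustrative grid)
param_grid_rbf = {'C': np.logspace(-3, 3, 7), 'gamma': np.logspace(-4, 1, 6)}
grid_rbf = GridSearchCV(SVC(kernel='rbf'), param_grid_rbf, cv=10)
grid_rbf.fit(X, y)
print(f"Best rbf parameters: {grid_rbf.best_params_}, score: {grid_rbf.best_score_:.3f}")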
In [32]:
# Train a final model. Note: these are NOT the tuned optimal parameters
# (the grid search above chose a linear kernel with C=100); the sigmoid
# model below illustrates how badly a mis-specified kernel can perform.
best_model = SVC(C=0.1, kernel='sigmoid')
best_model.fit(X, y)
Out[32]:
SVC(C=0.1, kernel='sigmoid')
In [33]:
# Evaluate on a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)
model_val = SVC(C=0.1, kernel='sigmoid')
model_val.fit(X_train, y_train)
y_pred = model_val.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(f"Accuracy: {np.mean(y_pred == y_val)}")
[[  0 105]
 [  0  91]]
Accuracy: 0.4642857142857143
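The sigmoid model predicts every validation car as 'Economy', doing worse than chance. For contrast, a sketch refitting the grid-search winner (linear kernel, C=100) on the same split; its exact accuracy will depend on the split:
In [ ]:
# Refit the tuned linear model on the same train/validation split
model_best = SVC(C=100, kernel='linear')
model_best.fit(X_train, y_train)
y_pred_best = model_best.predict(X_val)
print(confusion_matrix(y_val, y_pred_best))
print(f"Accuracy: {np.mean(y_pred_best == y_val):.3f}")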
In [34]:
# More than two classes
# Create ECO4 categories based on mpg values
Auto['ECO4'] = pd.cut(Auto['mpg'], bins=[0, 17, 22.75, 29, np.inf], labels=['Consuming', 'OK', 'Good', 'Economy'])
d4 = Auto[['ECO4', 'weight', 'horsepower']]
# Train SVM with ECO4 categories
X4 = d4[['weight', 'horsepower']]
y4 = d4['ECO4']
svm_multi = SVC(C=0.1, kernel='sigmoid')  # SVC handles >2 classes via one-vs-one
svm_multi.fit(X4, y4)
# Evaluate classification with more than two classes (in-sample)
y_pred_multi = svm_multi.predict(X4)
print(confusion_matrix(y4, y_pred_multi))
print(f"Accuracy with more classes: {np.mean(y_pred_multi == y4)}")
[[ 0  0 99  0]
 [62  0 33  0]
 [22  0 79  0]
 [ 1  0 96  0]]
Accuracy with more classes: 0.20153061224489796
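Again the sigmoid model uses only two of the four classes. classification_report (imported above but never used) gives per-class precision and recall; a sketch applying it with a linear kernel for comparison:
In [ ]:
# Per-class metrics for a linear-kernel multiclass SVM (illustrative; in-sample)
svm_multi_lin = SVC(C=100, kernel='linear')
svm_multi_lin.fit(X4, y4)
print(classification_report(y4, svm_multi_lin.predict(X4)))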